##### please load the correct libraries to run the codes successfully###

library(topicmodels)
library(dplyr)
#install.packages("readxl")
library(readxl)
library(qdapTools)
library(stm)
library(rJava)
#install.packages("Rwordseg", repos="http://R-Forge.R-project.org")
library(Rwordseg)
library(wordcloud)

library(stringr)
library(rJava)
library(tm)
library(wordcloud)

#install.packages("tmcn")
library(tmcn)
#install.packages("Rwordseg", repos="http://R-Forge.R-project.org")
#install.packages("sp")
library(sp)
library(rvest)
library(ggplot2)
library(ggpubr)
library(showtext) # Using Fonts More Easily in R Graphs
showtext_auto() 
library(stm)
library(tidyr)
library(gridExtra)
library(patchwork)
library(grid)
library(data.table)


####################################
##### Processing ##################
###################################
options(stringsAsFactors = F)
setwd("")
data<-read.csv("cleaned_data.csv", check.names = TRUE)

data<-data[,-1]

ins_temp<-sapply(data$`IC1.RP`, str_split,  ";") %>% lapply(as.data.frame) %>% lapply(t) %>%lapply(as.data.frame)

ins_temp<-rbindlist(ins_temp, fill=T)  %>% mutate(No=1:nrow(data))



ins_all_data<-pivot_longer(ins_temp, !No, names_prefix="V") %>% drop_na(value)
ins_all_data<-full_join(ins_all_data, data)
#ins_all_data<-rename(ins_all_data, institution="value")
colnames(ins_all_data)[3]<-"institution"

ins_all_data<-ins_all_data %>% mutate(ins=str_split(ins_all_data$institution, ",", simplify=T)[,1], 
                                      country=str_split(ins_all_data$institution, ",", simplify=T)[,2])


ins_all_data<-ins_all_data %>% filter(!str_detect(ins, "[:digit:]"))  %>% 
  filter(!str_detect(ins, "^Dept"), ins!="B")  %>% 
  mutate(ins= case_when(
    str_detect(ins, "^GIGA")~"GIGA", 
    str_detect(ins, "E China Normal Univ")~"East China Normal Univ",
    str_detect(ins, "S China Normal Univ")~"South China Normal Univ",
    str_detect(ins, "Fudan Univ China")~"Fudan Univ",
    str_detect(ins, "Chinese Acad Social Sci")~"China Acad Social Sci",
    str_detect(ins, "CASS")~"China Acad Social Sci",
    str_detect(ins, "Renmin")~"Renmin Univ China",
    str_detect(ins, "Peoples Univ")~"Renmin Univ China",
    str_detect(ins, "Remin")~"Renmin Univ China",
    str_detect(ins, "Harvards")~"Harvard Univ", 
    str_detect(ins, "^UCLA")~"Univ Calif Los Angeles", 
    str_detect(ins, "^London Sch Econ")~"London Sch Econ", 
    str_detect(ins, "^LSE")~"London Sch Econ",
    str_detect(ins,"Qinghua")~"Tsinghua Univ",
    TRUE~as.character(ins)
  ) 
  ) 

###  Hong KONG and Macau
ins_all_data<-ins_all_data %>% mutate(hk=ifelse(str_detect(ins_all_data$institution, "Hong Kong"), 1, 0), 
                                      macau=ifelse(str_detect(ins_all_data$institution, "Macau"), 1, 0), 
                                      )
ins_all_data<-ins_all_data %>% mutate(country=case_when(hk==1~"Hong Kong", macau==1~"Macau", TRUE~as.character(country)))



##### Part 1 Basic Sections#####

#### Manuscript Table 1 

#journal
length(unique(data$SO[data$ir==1]))
length(unique(data$SO[data$psc==1]))
length(unique(data$SO[data$area==1]))
#articles
nrow(data[data$ir==1,])
nrow(data[data$psc==1,])
nrow(data[data$area==1,])
#citation
sum(data$TC[data$ir==1])
sum(data$TC[data$psc==1])
sum(data$TC[data$area==1])
#std cite per journal
sum(data$cite_std[data$ir==1])/length(unique(data$SO[data$ir==1]))
sum(data$cite_std[data$psc==1])/length(unique(data$SO[data$psc==1]))
sum(data$cite_std[data$area==1])/length(unique(data$SO[data$area==1]))
#cite per article
sum(data$TC[data$ir==1])/nrow(data[data$ir==1,])
sum(data$TC[data$psc==1])/nrow(data[data$psc==1,])
sum(data$TC[data$area==1])/nrow(data[data$area==1,])




##############################
#### Appendix Figure 1-3 #######
#############################

## created with Excel###


##############################
#### Appendix Figure 4 #######
#############################

## proportion of articles ##
country_data<-ins_all_data %>% group_by(country, PY) %>%summarise(n=n())
country_data<-country_data %>% group_by(PY) %>% mutate(year_total=sum(n)) %>% mutate(c_p=n/year_total)
proportion_graph<-ggplot(data=filter(country_data, country=="China" | country=="USA" | country=="Hong Kong" | country=="Taiwan"  ), aes(x=PY, y=c_p, color=country, linetype=country)) +
  geom_line(size=1.5)+
  xlab("") + 
  ylab("") + 
  scale_x_continuous(breaks=seq(2001, 2020, 5))+
  ggtitle("Proportion of Articles")+
  theme(axis.text.x= element_text( size=15, color="black"))+
  theme(axis.text.y= element_text( size=15, color="black" ))+
  theme(axis.title=element_text(size=14))+
  theme(plot.title = element_text(hjust = 0.5, size=20))+
  theme(legend.position="bottom", legend.title=element_blank(), 
        legend.key.width= unit(2, 'cm'), legend.text = element_text(size=15))




## proportion of citation ##
country_data<-ins_all_data %>% group_by(country, PY) %>%summarise(n=n(), tc=sum(TC), cite_std=sum(cite_std))
country_data<-country_data %>% group_by(PY) %>% mutate(year_total=sum(n), year_t_cite=sum(tc), year_t_sc=sum(cite_std)) %>% 
            mutate(pub_p=n/year_total, cite_p=tc/year_t_cite, sc_p=cite_std/year_t_sc, m_c=tc/n, m_sc=cite_std/n)
country_data_sub<-filter(country_data, country=="China" | country=="USA" | country=="Hong Kong" | country=="Taiwan"   )

cite_p_graph<-ggplot(data=filter(country_data_sub), aes(x=PY, y=cite_p, color=country, linetype=country)) +
  geom_line(size=1.5)+
  xlab("") + 
  ylab("") + 
  scale_x_continuous(breaks=seq(2001, 2020, 5))+
  ggtitle("Proportion of Total  Citations")+
  theme(axis.text.x= element_text( size=15, color="black"))+
  theme(axis.text.y= element_text( size=15, color="black" ))+
  theme(axis.title=element_text(size=14))+
  theme(plot.title = element_text(hjust = 0.5, size=20))+
  theme(legend.position="bottom", legend.title=element_blank(), 
        legend.key.width= unit(2, 'cm'), legend.text = element_text(size=15))



## proportion of standardized citation ##

sc_p_graph<-ggplot(data=filter(country_data_sub), aes(x=PY, y=cite_std, color=country, linetype=country)) +
  geom_line(size=1.5)+
  xlab("") + 
  ylab("") + 
  scale_x_continuous(breaks=seq(2001, 2020, 5))+
  ggtitle("Total Standardized  Citations")+
  theme(axis.text.x= element_text( size=15, color="black"))+
  theme(axis.text.y= element_text( size=15, color="black" ))+
  theme(axis.title=element_text(size=14))+
  theme(plot.title = element_text(hjust = 0.5, size=20))+
  theme(legend.position="bottom", legend.title=element_blank(), 
        legend.key.width= unit(2, 'cm'), legend.text = element_text(size=15))



##proportion of standardized per article##

c_pa_graph<-ggplot(data=filter(country_data_sub), aes(x=PY, y=m_sc, color=country, linetype=country)) +
  geom_line(size=1.5)+
  xlab("") + 
  ylab("") + 
  scale_x_continuous(breaks=seq(2001, 2020, 5))+
  ggtitle(" Standardized Citation per Article")+
  theme(axis.text.x= element_text( size=15, color="black"))+
  theme(axis.text.y= element_text( size=15, color="black" ))+
  theme(axis.title=element_text(size=14))+
  theme(plot.title = element_text(hjust = 0.5, size=20))+
  theme(legend.position="bottom", legend.title=element_blank(), 
        legend.key.width= unit(2, 'cm'), legend.text = element_text(size=15))





png("App_Figure_4.png", width=1600, height=1600)

grid.arrange(proportion_graph,
             cite_p_graph,
             sc_p_graph,
             c_pa_graph
)
dev.off()


#######################################################################################################


###################################################################################
##################################################################################
###################################################################################
#####################

#########################
## Institution level#####







## institution data by years##
ins_data<-ins_all_data %>% group_by(PY, institution)  %>%summarise(n=n(), tc=sum(TC), cite_std=sum(cite_std))
ins_data<-ins_data %>% group_by(PY) %>% mutate(year_total=sum(n), year_t_cite=sum(tc), year_t_sc=sum(cite_std)) %>% 
  mutate(pub_p=n/year_total, cite_p=tc/year_t_cite, sc_p=cite_std/year_t_sc, m_c=tc/n, m_sc=cite_std/n)


## extract institutional data vector##

ins_data_in<- ins_all_data %>%
                      group_by(institution)  %>% summarise(n=n(), tc=sum(TC), cite_std=sum(cite_std), 
                                                   h_index=sum(TC>=seq_along(TC)), country, ins
                                                   ) 

ins_data_in <-unique(ins_data_in) 
                            

ins_data_in<-ins_data_in %>%  mutate(ins_total=sum(n), ins_t_cite=sum(tc),  ins_t_sc=sum(cite_std)) %>% 
  mutate(pub_p=n/ins_total, cite_p=tc/ins_t_cite, sc_p=cite_std/ins_t_sc, m_c=tc/n, m_sc=cite_std/n)

ins_data_in<-ins_data_in %>% ungroup() %>% mutate(pub_norm=(n-mean(n))/sd(n), cite_norm=(tc-mean(tc))/sd(tc), 
                                    std_norm=(cite_std-mean(cite_std))/sd(cite_std), 
                                    mc_norm=(m_c-mean(m_c))/sd(m_c), 
                                    msc_norm=(m_sc-mean(m_sc))/sd(m_sc), 
                                    h_norm=(h_index-mean(h_index))/sd(h_index))


ins_data_in<-ins_data_in %>% mutate(country2=case_when( country=="China"~"Mainland China", 
                                                        country=="USA"~"USA", 
                                                        TRUE~"Other"
                                                      )
                                    )

vec_c<-ins_data_in %>% filter(country=="China") %>% arrange(desc(n)) %>% top_n(10, wt=n) %>% select(ins) %>% as.vector()
vec_usa<-ins_data_in %>% filter(country=="USA") %>% arrange(desc(n)) %>% top_n(10, wt=n) %>% select(ins) %>% as.vector()
vec_other<-ins_data_in %>% filter(country2=="Other") %>% arrange(desc(n)) %>% top_n(10, wt=n) %>% select(ins) %>% as.vector()
vec_all<-c(vec_c, vec_usa, vec_other) %>% unlist()

##################### years change 



## institution data by years##
ins_data<-ins_all_data %>% mutate(biyear=ifelse(PY<2011, "2001-2010", "2011-2020"))%>% group_by(biyear, ins)  %>%  ## no need to adjust the wrong country
                        summarise(n=n(), tc=sum(TC), cite_std=sum(cite_std), h_index=sum(TC>=seq_along(TC)), country, ins) %>% ungroup() %>% unique()
ins_data<-ins_data %>% group_by(biyear) %>% mutate(year_total=sum(n), year_t_cite=sum(tc), year_t_sc=sum(cite_std)) %>% 
  mutate(pub_p=n/year_total, cite_p=tc/year_t_cite, sc_p=cite_std/year_t_sc, m_c=tc/n, m_sc=cite_std/n)


ins_data<-ins_data %>% mutate(country2=case_when( country=="China"~"Mainland China", 
                                                        country=="USA"~"USA", 
                                                        TRUE~"Other"
)
)

ins_data<-select(ins_data, biyear, country, country2,  ins, pub_p, cite_p, cite_std, m_sc, h_index, )
ins_data_rank<-ins_data%>% ungroup() %>% mutate(match=ins_data$ins%in%vec_all==T)
ins_data_rank<-ins_data_rank %>% filter(match==T)   %>% distinct(biyear, ins, .keep_all = T)


ins_data_rank<-pivot_wider(ins_data_rank, names_from=biyear, values_from=pub_p:h_index)

ins_data_rank<-mutate_if(ins_data_rank, is.numeric , replace_na, replace = 0)

ins_data_rank<-filter(ins_data_rank, !(country=="Singapore" & ins=="Nanjing Univ")) ## a mismatch between country and institute


##### Manuscript Figure 1 ######

### pub

ins_data_rank<-mutate(ins_data_rank, pub_pd=`pub_p_2011-2020`-`pub_p_2001-2010`, 
                      cite_pd=`cite_p_2011-2020`-`cite_p_2001-2010`,
                      cite_stdd=`cite_std_2011-2020`-`cite_std_2001-2010`,
                      msc_d=`m_sc_2011-2020`-`m_sc_2001-2010`,
                      h_indexd=`h_index_2011-2020`-`h_index_2001-2010`
                      )








### Figure 1 Panel A###
rank<-ggplot(data=ins_data_rank, aes(x=`pub_p_2001-2010`, y=reorder(ins, `pub_p_2011-2020`)) )
pubgraph<-rank+
  geom_point(cex=3, shape=16, alpha=0.4)  +
  geom_point(cex=3, shape=16, alpha=0.9, aes(x=`pub_p_2011-2020`)) +
  facet_wrap(~country2, nrow = 3, scales = "free_y")+
  xlab("") + ylab("") +ggtitle("Publication Proportion")+
  scale_y_discrete(label=function(x) abbreviate(x, minlength=6))+
  theme(axis.text.x= element_text( size=8, color="black"))+
  theme(axis.text.y= element_text( size=12, color="black" ))+
  theme(axis.title=element_text(size=14))+
  theme(plot.title = element_text(hjust = 0.5, size=16))+
  theme(legend.position="bottom", legend.title=element_blank(), 
        legend.text = element_text(size=15))
pubgraph           

### Figure 1 Panel B###
rank<-ggplot(data=ins_data_rank, aes(x=`cite_p_2001-2010`, y=reorder(ins, `cite_p_2011-2020`)) )
citegraph<-rank+
  geom_point(cex=3, shape=16, alpha=0.4)  +
  geom_point(cex=3, shape=16, alpha=0.9, aes(x=`cite_p_2011-2020`)) +
  facet_wrap(~country2, nrow = 3, scales = "free_y")+
  xlab("") + ylab("") +ggtitle("Citation Proportion")+
  scale_y_discrete(label=function(x) abbreviate(x, minlength=6))+
  theme(axis.text.x= element_text( size=8, color="black"))+
  theme(axis.text.y= element_text( size=12, color="black" ))+
  theme(axis.title=element_text(size=14))+
  theme(plot.title = element_text(hjust = 0.5, size=16))+
  theme(legend.position="bottom", legend.title=element_blank(), 
        legend.text = element_text(size=12))
citegraph           

### Figure 1 Panel C###
rank<-ggplot(data=ins_data_rank, aes(x=`cite_std_2001-2010`, y=reorder(ins, `cite_std_2011-2020`)) )
citestdgraph<-rank+
  geom_point(cex=3, shape=16, alpha=0.4)  +
  geom_point(cex=3, shape=16, alpha=0.9, aes(x=`cite_std_2011-2020`)) +
  facet_wrap(~country2, nrow = 3, scales = "free_y")+
  xlab("") + ylab("") +ggtitle("Standardized Citation Change")+
  theme(axis.text.x= element_text( size=8, color="black"))+
  theme(axis.text.y= element_text( size=12, color="black" ))+
  scale_y_discrete(label=function(x) abbreviate(x, minlength=6))+
  theme(axis.title=element_text(size=14))+
  theme(plot.title = element_text(hjust = 0.5, size=16))+
  theme(legend.position="bottom", legend.title=element_blank(), 
        legend.text = element_text(size=15))
citestdgraph  


### Figure 1 Panel D###
rank<-ggplot(data=ins_data_rank, aes(x=`m_sc_2001-2010`, y=reorder(ins, `m_sc_2011-2020`)) )
m_scgraph<-rank+
  geom_point(cex=3, shape=16, alpha=0.4)  +
  geom_point(cex=3, shape=16, alpha=0.9, aes(x=`m_sc_2011-2020`)) +
  facet_wrap(~country2, nrow = 3, scales = "free_y")+
  xlab("") + ylab("") +ggtitle("Standardized Citation Per Article")+
  scale_y_discrete(label=function(x) abbreviate(x, minlength=6))+
  theme(axis.text.x= element_text( size=8, color="black"))+
  theme(axis.text.y= element_text( size=12, color="black" ))+
  theme(axis.title=element_text(size=14))+
  theme(plot.title = element_text(hjust = 0.5, size=16))+
  theme(legend.position="bottom", legend.title=element_blank(), 
        legend.text = element_text(size=15))
m_scgraph           



### Figure 1 Panel E###
rank<-ggplot(data=ins_data_rank, aes(x=`h_index_2001-2010`, y=reorder(ins, `h_index_2011-2020`)) )
h_indexgraph<-rank+
  geom_point(cex=3, shape=16, alpha=0.4)  +
  geom_point(cex=3, shape=16, alpha=0.9, aes(x=`h_index_2011-2020`)) +
  facet_wrap(~country2, nrow = 3, scales = "free_y")+
  xlab("") + ylab("") +ggtitle("H Index")+
  scale_y_discrete(label=function(x) abbreviate(x, minlength=6))+
  theme(axis.text.x= element_text( size=8, color="black"))+
  theme(axis.text.y= element_text( size=12, color="black" ))+
  theme(axis.title=element_text(size=14))+
  theme(plot.title = element_text(hjust = 0.5, size=16))+
  theme(legend.position="bottom", legend.title=element_blank(), 
        legend.text = element_text(size=15))
h_indexgraph           




layout<-rbind(c(1,2,3), 
              c(4,NA, 5)
              )



png("Manuscript_Figure_1.png", width=1200, height=1600)
(pubgraph | citegraph) / (  citestdgraph | m_scgraph | h_indexgraph ) +
  plot_annotation(tag_levels = 'A')

dev.off()

tiff("Manuscript_Figure_1.tiff", width=1200, height=1600)
(pubgraph | citegraph) / (  citestdgraph | m_scgraph | h_indexgraph ) +
  plot_annotation(tag_levels = 'A')

dev.off()



#### Manuscript Figure 2: created from  ####


### Appendix Figure 5 & Figure 6: from other sources and created by Excel


#### Appendix FIgure 7


cnus_data<-data %>% mutate(cnus=ifelse(china==1 & usa==1, 1, 0)) %>%
  group_by(PY) %>% summarise(cnus_n=sum(cnus))



cite_p_graph<-ggplot(data=cnus_data, aes(x=PY, y=cnus_n)) +
  geom_line(size=1.5)+
  xlab("") + 
  ylab("") + 
  scale_x_continuous(breaks=seq(2001, 2020, 5))+
  ggtitle("China-U.S. Collaboration (Articles)")+
  theme(axis.text.x= element_text( size=15, color="black"))+
  theme(axis.text.y= element_text( size=15, color="black" ))+
  theme(axis.title=element_text(size=14))+
  theme(plot.title = element_text(hjust = 0.5, size=20))+
  theme(legend.position="bottom", legend.title=element_blank(), 
        legend.key.width= unit(2, 'cm'), legend.text = element_text(size=15))

png("App_Figure_7.png", height=800, width=1200)
cite_p_graph
dev.off()



##### Appendix Table 1, 2 and Table 3
ins_all_data<-mutate(ins_all_data, china=case_when(
  country=="China"~1, 
  TRUE~0
), 
usa=case_when(
  country=="USA"~1, 
  TRUE~0  
)
)

data_cn<-select(ins_all_data, No, china) %>% filter(china==1) %>% distinct()

data_us<-select(ins_all_data, No, usa)  %>% filter(usa==1) %>% distinct()

data<-data%>%left_join(data_cn)
data<-data %>% left_join(data_us)

data<-data %>% mutate(china=ifelse(is.na(china), 0, china), 
                      usa=ifelse(is.na(usa), 0, usa)
)

library(jtools)
#install.packages("huxtable")
#install.packages("sandwich")
#install.packages(c("officer", "flextable))

data<-data %>% mutate(other=ifelse(china==0 & usa==0, 1, 0))
Standardized<-lm(data=data, cite_std~china+usa+psc+ir+area+as.factor(PY))
Total_Citation<-lm(data=data, TC~china+usa+psc+ir++area+as.factor(PY))

export_reg<-export_summs(Standardized, Total_Citation, 
                         model.names = c("Standardized Citation","Total Citation"),
                         coefs = c("Mainland China" = "china",
                                   "USA" = "usa",
                                   "Polisci" = "pscTRUE", 
                                   "IR" = "irTRUE", 
                                   "AREA" = "areaTRUE", 
                                   "Constant"="(Intercept)"
                         ),
                         robust = TRUE, to.file="docx", file.name="App_Table_1.docx")
###### number of country###

country<-as.data.frame(str_split_fixed(data$"CC1.RP", ";", n=15))
colnames(country)<-paste0("country_", 1:length(country))
country<-sapply(country, na_if, "")
data<-cbind(data,country)




options("jtools-digits" = 2)

count_na <- function(x) sum(is.na(x))  
ins_country<-select(data, No, country_1:country_15) %>% mutate_all( list(~na_if(.,""))) %>%
  mutate(collab_c=ifelse(is.na(country_2)==T, F, T))

ins_country$cna<-rowSums(apply(is.na(ins_country), 2, as.numeric))
ins_country$collab_num<-15-ins_country$cna

ins_country<-select(ins_country, No, collab_c, collab_num)

data<- data %>% left_join(ins_country)

collab_std<-lm(data=data, cite_std~china+collab_num+china*collab_num+psc+ir+area+as.factor(PY))
collab_tc<-lm(data=data, TC~china+collab_num+china*collab_num+psc+ir+area+as.factor(PY))
collab_std2<-lm(data=data, cite_std~china+collab_c+china*collab_c+psc+ir+area+as.factor(PY))
collab_tc2<-lm(data=data, TC~china+collab_c+china*collab_c+psc+ir+area+as.factor(PY))

export_collab_reg<-export_summs(collab_std, collab_tc, collab_std2, collab_tc2,
                                model.names = c("Standardized Citation","Total Citation", "Standardized Citation2","Total Citation2"),
                                coefs = c("Mainland China" = "china",
                                          "Number of Countries" = "collab_num",
                                          "Collaboration"= "collab_cTRUE", 
                                          "Polisci" = "pscTRUE", 
                                          "IR" = "irTRUE", 
                                          "AREA" = "areaTRUE", 
                                          "CHN#collb"="china:collab_cTRUE",
                                          "CHN#Countries"="china:collab_num",
                                          "Constant"="(Intercept)"
                                ),
                                robust = TRUE, to.file="docx", file.name="App_table_2.docx")



collab_std<-lm(data=data, cite_std~usa+collab_num+usa*collab_num+psc+ir+area+as.factor(PY))
collab_tc<-lm(data=data, TC~usa+collab_num+usa*collab_num+psc+ir+area+as.factor(PY))
collab_std2<-lm(data=data, cite_std~usa+collab_c+usa*collab_c+psc+ir+area+as.factor(PY))
collab_tc2<-lm(data=data, TC~usa+collab_c+usa*collab_c+psc+ir+area+as.factor(PY))

export_collab_reg<-export_summs(collab_std, collab_tc, collab_std2, collab_tc2,
                                model.names = c("Standardized Citation","Total Citation", "Standardized Citation2","Total Citation2"),
                                coefs = c("USA" = "usa",
                                          "Number of Countries" = "collab_num",
                                          "Collaboration"= "collab_cTRUE", 
                                          "Polisci" = "pscTRUE", 
                                          "IR" = "irTRUE", 
                                          "AREA" = "areaTRUE", 
                                          "USA#collb"="usa:collab_cTRUE",
                                          "USA#Countries"="usa:collab_num",
                                          "Constant"="(Intercept)"
                                ),
                                robust = TRUE, to.file="docx", file.name="App_table_3.docx")

##############################################################################
##############################################################################
##############################################################################




###########Part 2: Structural Topic Modeling

### topic modeling

ins_all_data<-mutate(ins_all_data, china=case_when(
                                  country=="China"~1, 
                                  TRUE~0
                                                  ), 
                                  usa=case_when(
                                  country=="USA"~1, 
                                  TRUE~0  
                                  )
                    )

data_merge<-select(ins_all_data, No, china, usa) 

data_merge<-data_merge%>% distinct(No, .keep_all=T)

data<-left_join(data,data_merge )


stopwords_add<-c("china", "china's","chinas", "chinese", "political", "politics", "article", "study", "studies", "research", "paper", 
                 "find", "argue", "scholar", "approach", "case", "will", "theory", "theoretical", "theorize", "argument", "propose", 
                 "framework", "model", "analysis", "analyze", "argu"
)


data_model<-select(data, AB, china, usa, PY, ir, psc, area, TC, cite_std, No) %>%
  mutate(period=ifelse(PY<2011, 0, 1)) %>%
  mutate(psc=ifelse(psc==T, 1, 0), ir=ifelse(ir==T, 1, 0), area=ifelse(area==T, 1, 0))

set.seed(20171119)
processed<-textProcessor(data_model$AB, metadata=data_model, customstopwords=stopwords_add)

out<-prepDocuments(processed$documents, processed$vocab, processed$meta)

docs<-out$documents
vocab<-out$vocab
meta<-out$meta


### 25 models  ###

#("D:/OneDrive - zju.edu.cn/polisci_quan/draft/draft_v2_doc/model25")

#setwd("C:/Users/DELL/OneDrive - zju.edu.cn/polisci_quan/draft/draft_v2_doc/model25") ##laptop



chn_scholar_prefit<-stm(out$documents, out$vocab, K=25, prevalence=~china*PY, data=out$meta, seed=20171119)








topic_labels<-c("Policy and Institution", "International Law", "Market and Economic Development", "Discourse and Culture", "Education and Students", 
                "Central_Local Governance", "Military and Security", "Public Attitude", "Environmental Politics", "Power and Hegemony", 
                "Global Order", "Taiwan Issue", "Research Method", "Democratization and Regime", "Korean Peninsula", 
                "Economic Growth", "Urban and Migration", "Nation, Ethnicity, Identity", "Asian Pacific", "Cold War and Communism", 
                "Foreign Aid and Investment", "Trade and Stocks", "Media and Internet", "Contentious Politics", "Banking and Finance"
)

raw<-chn_scholar_prefit$theta
colnames(raw)<-topic_labels


thought<-findThoughts(chn_scholar_prefit, texts=out$meta$AB, n=5, topics=14)$docs[[1]]


png("Manuscript_Figure_3.png", width=1200, height=900)
plot.STM(chn_scholar_prefit, type="summary", xlim=c(0,.2), topic.names = topic_labels, font=2)
abline(v=0.04, lty="dashed")
dev.off()

tiff("Manuscript_Figure_3.tiff", width=1200, height=900)
plot.STM(chn_scholar_prefit, type="summary", xlim=c(0,.2), topic.names = topic_labels, font=2)
abline(v=0.04, lty="dashed")
dev.off()


prep<-estimateEffect(1:25~china*PY+psc+ir+area, chn_scholar_prefit, meta=out$meta, uncertainty = "None")




png("App_Figure_8.png", width=900, height=900)
par(mfrow=c(5,5))
for(i in 1:25){
  plot.estimateEffect(prep,covariate="PY", model=chn_scholar_prefit, topics = i,  method="continuous", 
                      moderator="china", moderator.value=1, linecol="red", printlegend=F, main=topic_labels[i], 
                      xlab="", ylim=c(0,0.15), ylab="", yaxt = "n", xaxt="n" )
  plot.estimateEffect(prep,covariate="PY", model=chn_scholar_prefit, topics = i,  method="continuous", 
                      moderator="china", moderator.value=0, linecol="blue", add=T,  printlegend=F, yaxt = "n", xaxt="n")
  axis(2, at=c(0, 0.075,  0.15), , labels=c(0,"", 0.15))
  axis(1, at=c(2001, 2011, 2020))
  abline(h=0.04, lty="dotted", color="grey40" )
}
dev.off()








prep<-estimateEffect(1:25~PY+psc+ir+area, chn_scholar_prefit, meta=out$meta, uncertainty = "None")



png("Manuscrip_Figure_5.png", width=900, height=900)
par(mfrow=c(5,5))
for(i in 1:25){
  plot.estimateEffect(prep,covariate="PY", model=chn_scholar_prefit, topics = i,  method="continuous", 
                      linecol="black", printlegend=F, main=topic_labels[i], 
                      xlab="", ylim=c(0,0.15), ylab="", yaxt = "n", xaxt="n" )
  axis(2, at=c(0, 0.075,  0.15), , labels=c(0,"", 0.15))
  axis(1, at=c(2001, 2011, 2020))
  abline(h=0.04, lty="dotted", col="black")
}
dev.off()

tiff("Manuscrip_Figure_5.tiff", width=900, height=900)
par(mfrow=c(5,5))
for(i in 1:25){
  plot.estimateEffect(prep,covariate="PY", model=chn_scholar_prefit, topics = i,  method="continuous", 
                      linecol="black", printlegend=F, main=topic_labels[i], 
                      xlab="", ylim=c(0,0.15), ylab="", yaxt = "n", xaxt="n" )
  axis(2, at=c(0, 0.075,  0.15), , labels=c(0,"", 0.15))
  axis(1, at=c(2001, 2011, 2020))
  abline(h=0.04, lty="dotted", col="black")
}
dev.off()

##### Figure 7
prep<-estimateEffect(1:25~china*PY+psc+ir+area, chn_scholar_prefit, meta=out$meta, uncertainty = "None")

png("Manuscript_Figure_7.png", width=900, height=900)
plot.estimateEffect(prep, covariate="china", topics=1:25, model=chn_scholar_prefit, cov.value1=1, cov.value2=0,  xlim=c(-0.1,0.1), 
                    method="difference", xlab= "Mainland China - Other", labeltype="custom", custom.labels = topic_labels)                                  
dev.off()


tiff("Manuscript_Figure_7.tiff", width=900, height=900)
plot.estimateEffect(prep, covariate="china", topics=1:25, model=chn_scholar_prefit, cov.value1=1, cov.value2=0,  xlim=c(-0.1,0.1), 
                    method="difference", xlab= "Mainland China - Other", labeltype="custom", custom.labels = topic_labels)                                  
dev.off()


##### Figure 13



brewer.pal(n = 11, name = "Greys")[1:5]


library(RColorBrewer)
display.brewer.all()
set.seed(20171119)

pal<-brewer.pal(name="Blues", n=11)[c(3,5,7,9)]
set.seed(20171119)
png("App_Figure_13.png", width=1200, height=1200)
par(mfrow=c(5,5))
for(i in 1:25){
  file<-topic_labels[i]
  cloud(chn_scholar_prefit, topic = i, scale = c(2,1), max.words = 20, colors=pal)
  title(main=file)
}  
dev.off()




################## topic correlation
library(igraph)
set.seed(20171119)
mod.out.corr <- topicCorr(chn_scholar_prefit)


mean_raw<-colMeans(raw)
nb.cols <- 25
mycolors <- colorRampPalette(brewer.pal(9, "Blues")[1:6])(nb.cols)
palette(mycolors)

pal<-data.frame(no=1:25, mean_raw=mean_raw)
pal<-arrange(pal, mean_raw) %>% cbind.data.frame(mycolors)
pal<-arrange(pal, no)




png("Manuscript_Figure_4.png", width=1600, height=1600)
plot.topicCorr(mod.out.corr, topics=1:25, vlabels=topic_labels, vertex.label.cex=1.5, set.seed(20171119), vertex.color= pal$mycolors, vertex.size =mean_raw*200 )
dev.off()




### topic compares##


png("App_Figure_11.png", width=900, height=900)
plot(chn_scholar_prefit, type="perspectives", topics=c(3,16), plabels=c(topic_labels[3], topic_labels[16]))
dev.off()


##### topic thought####

for (i in 1:25) {
  
  filename<-paste0("thought", i, ".pdf")
  
  thought<-findThoughts(chn_scholar_prefit, texts=out$meta$AB, n=2, topics=i)$docs[[1]]
  
  pdf(filename, width = 1000, height=1600, paper="a4")  
  plotQuote(thought, width =  60, main=topic_labels[i])
  dev.off()  
  
}

plotQuote(thought, width =  806, height=1156, main=topic_labels[i])






pdf("App_Figure_10.pdf")

par(mfrow=c(2,2), mar=c(0.5, 0.5, 1, 0.5))

thought<-findThoughts(chn_scholar_prefit, texts=out$meta$AB, n=1, topics=3)$docs[[1]]
plotQuote(thought,width =  60, main="Market and Economic Development"  )
thought<-findThoughts(chn_scholar_prefit, texts=out$meta$AB, n=1, topics=16)$docs[[1]]
plotQuote(thought,width =  60, main="Economic Growth"  )
thought<-findThoughts(chn_scholar_prefit, texts=out$meta$AB, n=4, topics=22)$docs[[1]][2]
plotQuote(thought,width =  60, main="Trade and Stocks"  )
thought<-findThoughts(chn_scholar_prefit, texts=out$meta$AB, n=1, topics=25)$docs[[1]]
plotQuote(thought,width =  60, main="Banking and Finance"  )

dev.off()



#################################################################################
##################################################

## citation




prep<-estimateEffect(1:25~cite_std+ir+psc+area, chn_scholar_prefit, meta=out$meta, uncertainty = "None")



png("Manuscript_Figure_6.png", width=2000, height=2000)
par(mfrow=c(5,5))
for(i in 1:25){
  plot.estimateEffect(prep,covariate="cite_std", model=chn_scholar_prefit, topics = i,  method="continuous", 
                      linecol="black", printlegend=F, main=topic_labels[i], ylab="", cex.main = 3,
                      xlab="" , ylim=c(0,0.3))
  
}
dev.off()


tiff("Manuscript_Figure_6.tiff", width=2000, height=2000)
par(mfrow=c(5,5))
for(i in 1:25){
  plot.estimateEffect(prep,covariate="cite_std", model=chn_scholar_prefit, topics = i,  method="continuous", 
                      linecol="black", printlegend=F, main=topic_labels[i], ylab="", cex.main = 3,
                      xlab="" , ylim=c(0,0.3))
  
}
dev.off()




#### what china scholars published in IR journals
prep<-estimateEffect(1:25~china*ir, chn_scholar_prefit, meta=out$meta, uncertainty = "None")



ir_china<-cbind.data.frame(raw, meta$ir, meta$china) %>% rename("ir"="meta$ir", "china"="meta$china")
ir_china<-filter(ir_china,  china==1) %>% group_by(ir) %>% summarise(across(everything(), list(mean)))
ir_china<-ir_china %>%select(-china_1)
library(purrr)
library(forcats)
mean_china_ir<-gather(ir_china[ir_china$ir==0,], key=ir, value="prop") %>%mutate( ir=0, topic=colnames(ir_china[2:26])) 
mean_china_ir_1<-gather(ir_china[ir_china$ir==1,], key=ir, value="prop") %>%mutate( ir=1, topic=colnames(ir_china[2:26]))
mean_china_ir<-rbind.data.frame(mean_china_ir, mean_china_ir_1)
mean_china_ir$topic<-mean_china_ir$topic %>% gsub(pattern="_1", replacement="")
mean_china_ir$ir<-as.factor(mean_china_ir$ir) %>% fct_rev()
library(ggpubr)
png("App_Figure_9.png", width=1200, height=900)
ggdotchart(mean_china_ir, x="topic", y="prop", group="ir", 
           color="ir",
           palette=c("black", "grey"), 
           ggtheme=theme_pubr(), sorting="desc", 
           dot.size=2, 
           rotate=T, ylab="Proportion of Topics", 
 
           ) + 
          geom_hline(yintercept=0.04, color="black", linetype="longdash")+
          theme_cleveland()+
          theme(legend.position = "bottom")+
          labs(color='IR Journals?') 
dev.off()













####################################################
#################################################### 
#Part 3: Extra Information

### Appendix Table 4

data  %>% top_n(10, wt=TC) %>% arrange(desc(TC))


#### Appendix Table 5

data  %>% top_n(10, wt=cite_std) %>% arrange(desc(cite_std))

########Appendix Table 6

jn_cite<-data %>% group_by(SO) %>% summarise(total_cite=sum(TC), n=n(), 
                                             cite_std=sum(cite_std), SC=first(SC), ir=first(ir), area=first(area), psc=first(psc))
jn_cite<-jn_cite %>% mutate(qual=total_cite/n, std_qual=cite_std/n)

jn_cite %>% top_n(10, wt=n) %>% arrange(desc(n))

####### APpendix Table 7
jn_cite %>% top_n(10, wt=total_cite) %>% arrange(desc(total_cite))
###Appendix Table 8
jn_cite[jn_cite$n>10, ] %>% top_n(10, wt=qual) %>% arrange(desc(qual))
###Appendix Table 9
jn_cite[jn_cite$n>10, ] %>% top_n(10, wt=std_qual) %>% arrange(desc(std_qual))


#### Appendix Table 10
## using the dataset with reference information (not included in the replication data)
